This challenge requires us to discover directed neural connections from time series of neural imaging data that depict neuron activation patterns.
More at http://www.kaggle.com/c/connectomics
We start with a listing of the installed Python packages using pip freeze, so all of this can be reproduced. We also set matplotlib to inline output and load the cythonmagic extension for IPython (see http://ipython.org/ipython-doc/dev/config/extensions/cythonmagic.html ).
In [32]:
!pip freeze
In [2]:
%matplotlib inline
%load_ext cythonmagic
Next, download and extract the data packages linked at http://www.kaggle.com/c/connectomics/data to a suitable data directory
In [1]:
# Some core imports
import os
import sys
from subprocess import call
import time
import pandas
import numpy as np
import h5py
# These are IPython specific
from IPython.display import display, clear_output
In [2]:
# Core configuration
datadir = "/data/quick/connectomics"
mirror = "http://www.causality.inf.ethz.ch/data/connectomics"
nbdir = os.getcwd()
sources = [
"http://www.kaggle.com/c/connectomics/download/normal-1.tgz",
"http://www.kaggle.com/c/connectomics/download/normal-2.tgz",
"http://www.kaggle.com/c/connectomics/download/normal-3.tgz",
"http://www.kaggle.com/c/connectomics/download/normal-4.tgz",
"http://www.kaggle.com/c/connectomics/download/normal-4-lownoise.tgz",
"http://www.kaggle.com/c/connectomics/download/normal-4-lownoise.tgz",
"http://www.kaggle.com/c/connectomics/download/small.tgz",
"http://www.kaggle.com/c/connectomics/download/highcc.tgz",
"http://www.kaggle.com/c/connectomics/download/lowcc.tgz",
"http://www.kaggle.com/c/connectomics/download/lowcon.tgz",
"http://www.kaggle.com/c/connectomics/download/highcon.tgz",
"http://www.kaggle.com/c/connectomics/download/normal-3-highrate.tgz",
"http://www.kaggle.com/c/connectomics/download/test.tgz",
"http://www.kaggle.com/c/connectomics/download/valid.tgz",
"http://www.kaggle.com/c/connectomics/download/sampleSubmission.csv.zip"
]
In [3]:
print "Creating data directory"
call(['mkdir', '-p', datadir + "/downloads"])
call(['mkdir', '-p', datadir + "/tmp"])
os.chdir(datadir + "/tmp")
In [6]:
cmd = ['wget', '-O', 'target', 'src']  # 'target' and 'src' are placeholders, replaced per file below
for src in sources:
    clear_output()
    print "Downloading %s" % (src)
    sys.stdout.flush()
    filename = src.rsplit('/', 1)[-1]
    target = "%s/tmp/%s" % (datadir, filename)
    mirror_src = "%s/%s" % (mirror, filename)
    # Remove any partial download from a previous run
    try:
        os.unlink(target)
    except OSError:
        pass
    cmd[2] = target
    cmd[3] = mirror_src
    call(cmd)
    print "Extracting %s" % (filename)
    sys.stdout.flush()
    call(["tar", "-xf", filename])
    print "Removing %s" % (filename)
    sys.stdout.flush()
    os.unlink(target)
clear_output()
print "Downloads finished"
We are going to make the data a bit more accessible by loading it into a hierarchical HDF5 data store using the h5py library ( see http://docs.h5py.org/en/latest/ ). While pandas also supports HDF5, it does not natively support multidimensional tensors/matrices which we will use later on in conjunction with Theano.
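For illustration, here is a minimal, self-contained h5py sketch (toy file path and array shape, not part of the actual pipeline): a 3-dimensional array is stored directly as a single dataset, which pandas' HDFStore cannot do natively.
In [ ]:
# Toy example only: store a 3-D tensor as one HDF5 dataset
import numpy as np
import h5py
with h5py.File('/tmp/h5py_example.h5', 'w') as f:
    tensor = np.random.rand(4, 100, 1000).astype(np.float32)
    dset = f.create_dataset('toy', data=tensor, compression='lzf')
    print dset.shape, dset.dtype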
In [7]:
def import_network(store, name, f_file, p_file, n_file):
    print "Importing %s" % (name)
    # Fluorescence time series: one column per neuron, one row per time step
    fdata = pandas.read_csv(f_file, sep=',', skipinitialspace=True, header=None, prefix='N', dtype=np.float32)
    concatenated = pandas.concat([fdata[c] for c in fdata.columns])
    timecount = len(fdata)
    neuroncount = len(fdata.columns)
    fdata = None  # free memory early
    # Column-wise concatenation reshapes so that each row is one neuron's time series
    tdata = concatenated.values.reshape((neuroncount, timecount))
    g = store.create_group(name)
    g.create_dataset("fluorescence", data=tdata, compression='lzf')
    g['fluorescence'].dims[0].label = 'neuron'
    g['fluorescence'].dims[1].label = 'time'
    # 2-D positions of the neurons, one (x, y) row per neuron
    pdata = pandas.read_csv(p_file, sep=',', skipinitialspace=True, header=None, names=['x', 'y'], dtype=np.float64)
    concatenated = pandas.concat([pdata[c] for c in pdata.columns])
    tensor = concatenated.values.reshape((2, len(pdata))).T
    g.create_dataset("networkPositions", data=tensor)
    # Ground-truth connectivity is only available for the training networks
    if n_file is not None and os.path.isfile(n_file):
        ndata = pandas.read_csv(n_file, sep=',', skipinitialspace=True, header=None,
                                names=['i', 'j', 'weight'],
                                dtype={'i': np.int32, 'j': np.int32, 'weight': np.float32})
        connection_matrix = np.zeros((neuroncount, neuroncount), dtype=np.float32)
        for idx in range(len(ndata)):
            i = int(ndata.i[idx])
            j = int(ndata.j[idx])
            weight = float(ndata.weight[idx])
            # Entries with weight -1 mark unconnected pairs; store them as 0
            if weight == -1.0:
                weight = 0.0
            connection_matrix[i - 1, j - 1] = weight  # the files use 1-based neuron indices
        g.create_dataset("connectionMatrix", data=connection_matrix, compression='lzf')
    return g
In [4]:
store = h5py.File(datadir + '/store.h5')
In [10]:
for i in range(1, 7):
    import_network(store, 'small-%d' % (i),
                   'fluorescence_iNet1_Size100_CC0%dinh.txt' % (i),
                   'networkPositions_iNet1_Size100_CC0%dinh.txt' % (i),
                   'network_iNet1_Size100_CC0%dinh.txt' % (i))
for nname in ['normal-1', 'normal-2', 'normal-3', 'normal-4', 'normal-4-lownoise', 'lowcc', 'highcc', 'highcon', 'test', 'valid']:
    import_network(store, nname,
                   "%s/fluorescence_%s.txt" % (nname, nname),
                   "%s/networkPositions_%s.txt" % (nname, nname),
                   "%s/network_%s.txt" % (nname, nname))
print "Done"
print store
That's it: the raw data is now in a nicely accessible HDF5 store, and we can access the individual datasets like this:
In [29]:
store['normal-1']['connectionMatrix']
Out[29]:
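Since h5py datasets are read lazily, we can also slice out just the region we need without loading a whole matrix into memory, for example (a small illustrative snippet):
In [ ]:
# Slicing reads only the requested region from disk
fluo = store['normal-1']['fluorescence']
print fluo.shape      # (neurons, timesteps)
print fluo[0, :10]    # first 10 fluorescence samples of neuron 0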
In [63]:
from math import sqrt
def create_spatial_distance_matrix(net):
    ''' Compute the pairwise Euclidean distances between all neuron positions'''
    positions = net['networkPositions'].value
    def distance(n1, n2):
        x1 = positions[n1, 0]
        y1 = positions[n1, 1]
        x2 = positions[n2, 0]
        y2 = positions[n2, 1]
        return sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2))
    nneurons = positions.shape[0]
    distanceMatrix = np.zeros((nneurons, nneurons), dtype=np.float32)
    for n1 in range(nneurons):
        for n2 in range(nneurons):
            distanceMatrix[n1, n2] = distance(n1, n2)
    # Replace any previously stored matrix
    try:
        del net['spatialDistanceMatrix']
    except KeyError:
        pass
    net.create_dataset("spatialDistanceMatrix", data=distanceMatrix)
In [105]:
def update_signal_distance_matrix(net):
    ''' Compute the directed shortest-path (hop count) matrix between all neurons'''
    if 'connectionMatrix' not in net.keys():
        return False
    connections = net['connectionMatrix'].value
    # Replace any previously stored matrix
    try:
        del net['signalDistanceMatrix']
    except KeyError:
        pass
    inf = float('inf')
    # Start from the direct connections; unconnected pairs get distance infinity,
    # and each neuron has distance zero to itself
    signalMatrix = np.zeros_like(connections, dtype=np.float32)
    signalMatrix[:, :] = connections[:, :]
    signalMatrix[signalMatrix == 0.0] = inf
    for i in range(signalMatrix.shape[0]):
        signalMatrix[i, i] = 0.0
    def signal(src, via):
        # Relax all paths from src that run through via
        depth = signalMatrix[src, via]
        signal = signalMatrix[via, :] + float(depth)
        signalMatrix[src, :] = np.minimum(signalMatrix[src, :], signal)
    # Repeat the relaxation until a fixpoint is reached (Floyd-Warshall style)
    for i in range(1000):
        orig = signalMatrix.copy()
        for src in range(signalMatrix.shape[0]):
            for via in range(signalMatrix.shape[0]):
                if via == src:
                    continue
                if signalMatrix[src, via] >= inf:
                    continue
                signal(src, via)
        if np.all(signalMatrix == orig):
            break
    net.create_dataset("signalDistanceMatrix", data=signalMatrix)
    print "Done updating"
In [ ]:
from sys import stdout
for nwname in store.keys():
    net = store[nwname]
    print "Network: %s" % (nwname)
    if 'networkPositions' in net.keys():
        print "Calculating spatial distance matrix"
        stdout.flush()
        create_spatial_distance_matrix(net)
    print "Calculating signal distance matrix"
    stdout.flush()
    update_signal_distance_matrix(net)
Let's verify that the code is correct by cross-checking it against the shortest paths computed by the NetworkX library:
In [89]:
import networkx as nx
def create_nx_graph(net):
    ''' Load a neural network with known connectivity as a NetworkX directed graph'''
    if 'connectionMatrix' not in net:
        return None
    G = nx.DiGraph()
    # Load the matrix into memory; element-wise reads from HDF5 are slow
    conn = net['connectionMatrix'].value
    for i in range(conn.shape[0]):
        G.add_node(i)
    for i in range(conn.shape[0]):
        for j in range(conn.shape[0]):
            if conn[i, j] > 0.0:
                G.add_edge(i, j)
    return G
In [104]:
# Simple test case: check all connections of the small-1 network
G = create_nx_graph(store['small-1'])
sdm = store['small-1']['signalDistanceMatrix'].value
for i in range(100):
    for j in range(100):
        try:
            plen = len(nx.shortest_path(G, i, j)) - 1
            assert plen == sdm[i, j]
        except nx.NetworkXNoPath:
            # Unreachable pairs must show up as an infinite signal distance
            assert sdm[i, j] == float('inf')
print "Test passed"
In [5]:
store.close()